View Javadoc

1   
2   /*
3    * SmartCrawler
4    *
5    * $Id: Crawler.java,v 1.18 2005/08/08 11:01:00 vincool Exp $
6    * Copyright 2005 Davide Pozza
7    *
8    * This program is free software; you can redistribute it
9    * and/or modify it under the terms of the GNU General Public
10   * License as published by the Free Software Foundation;
11   * either version 2 of the License, or (at your option) any
12   * later version.
13   *
14   * This program is distributed in the hope that it will be
15   * useful, but WITHOUT ANY WARRANTY; without even the implied
16   * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
17   * PURPOSE. See the GNU General Public License for more
18   * details.
19   *
20   * You should have received a copy of the GNU General Public
21   * License along with this program; if not, write to the Free
22   * Software Foundation, Inc., 59 Temple Place, Suite 330,
23   * Boston, MA 02111-1307 USA
24   *
25   */
26  
27  package org.smartcrawler;
28  
29  import org.apache.commons.configuration.ConfigurationException;
30  import org.apache.log4j.Level;
31  import org.apache.log4j.Logger;
32  import org.smartcrawler.common.Context;
33  import org.smartcrawler.common.ConfigReader;
34  import org.smartcrawler.common.Link;
35  import org.smartcrawler.common.MalformedLinkException;
36  import org.smartcrawler.common.Provider;
37  import org.smartcrawler.common.ProviderFactory;
38  import org.smartcrawler.common.SCLogger;
39  
40  
41  
42  /***
43   * The class which is responsible of starting the crawling processes.
44   *
45   * @author <a href="mailto:pozzad@alice.it">Davide Pozza</a>
46   * @version <tt>$Revision: 1.18 $</tt>
47   */
48  public class Crawler {
49  
50      /*** Configuration parameters for the session */
51      private Context context;
52  
53      /*** The object which stores and provides the links to process */
54      private Provider provider;
55  
56      /***
57       *
58       * @param urlStr The starting url.
59       * @param configFileName The configuration file name to use for the crawling.
60       *
61       * @throws org.smartcrawler.common.MalformedLinkException
62       * @throws org.apache.commons.configuration.ConfigurationException
63       */
64      public Crawler(String urlStr, String configFileName)
65      throws MalformedLinkException, ConfigurationException {
66          Logger.getRootLogger().setLevel(Level.OFF);
67  
68          ConfigReader confReader = new ConfigReader();
69          doInit(urlStr,
70                  confReader.readConfig(configFileName));
71      }
72  
73      /***
74       *
75       * @param urlStr
76       * @param context
77       * @throws org.smartcrawler.common.MalformedLinkException
78       * @throws org.apache.commons.configuration.ConfigurationException
79       */
80      public Crawler(String urlStr, Context context)
81      throws MalformedLinkException, ConfigurationException {
82          doInit(urlStr, context);
83      }
84  
85      /***
86       *
87       * @param urlStr
88       * @param context
89       * @throws org.smartcrawler.common.MalformedLinkException
90       * @throws org.apache.commons.configuration.ConfigurationException
91       */
92      public Crawler(String urlStr)
93      throws MalformedLinkException, ConfigurationException {
94          Context defaultConf = new Context();
95          doInit(urlStr, defaultConf);
96      }
97  
98      /***
99       *
100      * @param urlStr
101      * @param context
102      * @throws org.smartcrawler.common.MalformedLinkException
103      * @throws org.apache.commons.configuration.ConfigurationException
104      */
105     protected void doInit(String urlStr, Context context)
106     throws MalformedLinkException, ConfigurationException {
107 
108         this.context = context;
109 
110         SCLogger.initialize(context.getLoggers());
111 
112         Link initial = new Link(urlStr);
113         this.context.setInitialLink(initial);
114 
115         this.provider = ProviderFactory.instance().create();
116         provider.store(initial);
117 
118 
119     }
120 
121     /***
122      * Starts the {@link org.smartcrawler.DownloadEngine} threads by using the
123      * configuration settings supplied by the
124      * {@link org.smartcrawler.common.SiteConfiguration}.
125      *
126      */
127     public void startEngines() {
128         int enginesThreadNum = 1;
129         enginesThreadNum = this.context.getEngineThreadNumber();
130         for (int i = 0; i < enginesThreadNum; i++) {
131             DownloadEngine eng = new DownloadEngine(context);
132             eng.setName(" [Engine-" + (i + 1) + "] ");
133             eng.start();
134         }
135         /*
136         try {
137             Thread.sleep(2000);
138         } catch(Exception e) {}
139          */
140     }
141 
142     /***
143      * The main method
144      *
145      * @param args The command line arguments.
146      */
147     public static void main(String[] args) {
148         String urlStr = null;
149 
150         String configFileName = "bin/conf/smartcrawler-config.xml";
151 
152         try {
153             if (args.length > 0) {
154                 urlStr = args[0];
155             } else {
156                 System.out.println("Please specify a valid starting url.");
157             }
158             if (args.length > 1) {
159                 configFileName = args[1];
160             } else {
161                 String home = System.getProperty("smartcrawler.home");
162                 String sep = System.getProperty("file.separator");
163                 if (home != null) {
164                     configFileName = home + sep + configFileName;
165                 }
166             }
167             new Crawler(urlStr, configFileName).startEngines();
168 
169         } catch (MalformedLinkException e) {
170             System.out.println("Invalid initial link! " + urlStr);
171         } catch (Exception e) {
172             System.out.println("Generic error: " + e.getMessage());
173         }
174     }
175 }